In [1]:
import json
import datetime
import numpy as np
# Directory holding the JSON-lines shards named file<N>.txt.
folder = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/outfiles'
# counts[i] accumulates the number of code blocks whose code is longer than
# i characters. Blocks of 10000+ characters increment every slot, so the
# curve is only informative for lengths below 10000.
counts = np.zeros(10000)
count = 0  # not updated in this cell; retained in case other cells read it
for file_num in range(793):
    # Shards 660-669 are skipped here. NOTE(review): the reason is not
    # recorded in this notebook -- confirm whether it still applies.
    if 659 < file_num < 670:
        continue
    filename = folder + '/file' + str(file_num) + '.txt'
    # Use a context manager so each shard's handle is closed as soon as it
    # has been read, instead of leaking one open file per shard.
    with open(filename) as shard:
        for line in shard:
            line_obj = json.loads(line)
            for code_block in line_obj['CodeBlocks']:
                # Slice assignment bumps slots 0 .. len(code)-1 in one step.
                counts[:len(code_block['code'])] += 1
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
# counts[i] is the number of code blocks longer than i characters, so this
# curve shows how many blocks survive each candidate length threshold.
plt.plot(counts)
plt.xlabel('Code length threshold (characters)')
plt.ylabel('Number of code blocks longer than threshold')
Out[2]:
In [3]:
# Gather the source text of every code block longer than 10000 characters.
# Unlike the counting pass, this loop reads all 793 shards (none skipped).
code_longer_than_threshold = []
for file_num in range(793):
    filename = folder + '/file' + str(file_num) + '.txt'
    # Context manager guarantees the shard is closed even if a line fails
    # to parse, rather than leaving the handle open.
    with open(filename) as shard:
        for line in shard:
            line_obj = json.loads(line)
            for code_block in line_obj['CodeBlocks']:
                if len(code_block['code']) > 10000:
                    code_longer_than_threshold.append(code_block['code'])
In [5]:
# Display how many code blocks were collected into the list.
len(code_longer_than_threshold)
Out[5]:
In [26]:
# Use at most the first 2000 collected snippets as the sample for this run.
experiment_code = code_longer_than_threshold[:2000]
In [27]:
import os
# Label for this experiment run; it names the working directory below.
threshold = '10000_run2000'
run_dir = 'plagiarism/plag' + str(threshold)
# makedirs(..., exist_ok=True) creates the missing parents ('plagiarism' and
# the run directory) and makes the cell safe to re-run; plain os.mkdir raised
# FileExistsError on a second execution and failed when 'plagiarism' was absent.
for sub_dir in ('inputs', 'outputs'):
    os.makedirs(run_dir + '/' + sub_dir, exist_ok=True)
In [28]:
import os
# Write each sampled snippet to its own file<i>.py inside the run's inputs
# directory.
for i, snippet in enumerate(experiment_code):
    base_name = 'plagiarism/plag' + str(threshold) + '/inputs/file' + str(i) + '.py'
    # The with-block closes the file even if write() raises (for example on
    # an encoding error), which the manual open/close pair did not.
    with open(base_name, 'w') as f:
        f.write(snippet)
In [30]:
import os
# Paths below are relative, so run.sh is presumably meant to be executed from
# inside the run directory (jar one level up) -- TODO confirm.
jar_file = '../jplag-2.11.9-SNAPSHOT-jar-with-dependencies.jar'
lang = 'python3'
# Fixed typo: was 'ouputs', which did not match the 'outputs' directory
# created for this run.
results = 'outputs'
inputs = 'inputs'
# NOTE(review): flag meanings (-l, -r, -s, -m) follow the JPlag CLI; verify
# them against the usage text of the jar version named above.
command = "java -jar " + jar_file + " -l " + lang + " -r " + results + " -s " + inputs + " -m 200"
# Shell script that runs JPlag, redirects its stdout to experiment_cleaned.out
# and echoes the elapsed wall-clock time as minutes:seconds.
bash_file_text = '''start=$(date +%s)
touch experiment_cleaned.out
''' + command + " > experiment_cleaned.out" + '''
end=$(date +%s)
runtime=$(python -c "print ('%u:%02u' % ((${end} - ${start})/60, (${end} - ${start})%60))")
echo $runtime
'''
# Close the handle via the context manager so the script is flushed to disk
# before anything tries to execute it.
with open('plagiarism/plag' + str(threshold) + '/run.sh', 'w') as bash_file:
    bash_file.write(bash_file_text)
Out[30]:
In [33]:
# Measured run times at a threshold of 10000, recorded as
# (number of files, 'minutes:seconds'). The last run was logged as
# "~3800 files" and is plotted at 3850.
timings = [
    (100, '1:44'),
    (200, '3:32'),
    (500, '6:16'),
    (1000, '9:33'),
    (2000, '17:49'),
    (3850, '46:54'),
]
# x values: file counts; y values: elapsed time converted to whole seconds.
results = [n_files for n_files, _elapsed in timings]
results_y = [
    int(_elapsed.split(':')[0]) * 60 + int(_elapsed.split(':')[1])
    for n_files, _elapsed in timings
]
plt.scatter(results, results_y)
plt.xlabel('Number of Files')
plt.ylabel('Number of seconds to completion')
Out[33]:
In [ ]:
In [ ]: